%load_ext pretty_jupyter

Introduction

In diesem Notebook wenden wir Applied Machine Learning (AML) Techniken an, um effektive Strategien für personalisierte Kreditkarten-Werbekampagnen zu entwickeln. Unser Ziel ist es, mithilfe von Kunden- und Transaktionsdaten präzise Modelle zu erstellen, die die Wahrscheinlichkeit des Kreditkartenkaufs vorhersagen.

Lib Importing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from itables import init_notebook_mode
from datetime import datetime

init_notebook_mode(all_interactive=True)

Load the Data

account = pd.read_csv("account.csv", sep=";", dtype={"date": "str"})
account["date"] = pd.to_datetime(account["date"], format="%y%m%d")

card = pd.read_csv("card.csv", sep=";", dtype={"issued": "str"})
# Man kann die Zeit weglassen da immer 00:00:00
card["issued"] = pd.to_datetime(card["issued"].str[:6], format="%y%m%d")

client = pd.read_csv("client.csv", sep=";")
disp = pd.read_csv("disp.csv", sep=";")
district = pd.read_csv("district.csv", sep=";")

loan = pd.read_csv("loan.csv", sep=";", dtype={"date": "str"})
loan["date"] = pd.to_datetime(loan["date"], format="%y%m%d")


order = pd.read_csv("order.csv", sep=";")

trans = pd.read_csv("trans.csv", sep=";", low_memory=False, dtype={"date": "str"})
trans["date"] = pd.to_datetime(trans["date"], format="%y%m%d")

EDA

Account

account
account_id district_id frequency date
Loading... (need help?)

Card

card
card_id disp_id type issued
Loading... (need help?)

Client

client
client_id birth_number district_id
Loading... (need help?)

Disp

disp
disp_id client_id account_id type
Loading... (need help?)

District

district
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16
Loading... (need help?)

Loan

loan
loan_id account_id date amount duration payments status
Loading... (need help?)

Order

order
order_id account_id bank_to account_to amount k_symbol
Loading... (need help?)

Trans

trans
trans_id account_id date type operation amount balance k_symbol bank account
Loading... (need help?)

Transformations

data_frames = {}

Account

# Frequency Transformation
account["frequency"] = account["frequency"].replace(
    {
        "POPLATEK MESICNE": "MONTHLY ISSUANCE",
        "POPLATEK TYDNE": "WEEKLY ISSUANCE",
        "POPLATEK PO OBRATU": "ISSUANCE AFTER TRANSACTION",
    }
)

# Rename Column
account = account.rename(columns={"frequency": "issuance_statement_frequency"})

# Convert Date Column to datetime format
account["date"] = pd.to_datetime(account["date"])

# Assuming 'data_frames' is a dictionary of DataFrames
data_frames["account.csv"] = account

# Sample 5 random rows
account.sample(n=5)
account_id district_id issuance_statement_frequency date
Loading... (need help?)

Card

card["issued"] = pd.to_datetime(card["issued"], format="mixed")
data_frames["card.csv"] = card

Client

# Funktion zur Bestimmung des Geschlechts und Berechnung des Geburtstags
def parse_details(birth_number):
    birth_number_str = str(
        birth_number
    )  # Konvertiere birth_number zu einem String, falls notwendig
    year_prefix = "19"
    month = int(birth_number_str[2:4])
    gender = "female" if month > 12 else "male"
    if gender == "female":
        month -= 50
    year = int(year_prefix + birth_number_str[:2])
    day = int(birth_number_str[4:6])
    birth_day = datetime(year, month, day)
    return gender, birth_day


# Berechnung des Alters basierend auf einem Basisjahr
def calculate_age(birth_date, base_date=datetime(1999, 12, 31)):
    return (
        base_date.year
        - birth_date.year
        - ((base_date.month, base_date.day) < (birth_date.month, birth_date.day))
    )


# Anwenden der Funktionen und Erstellen neuer Spalten
client["gender"], client["birth_day"] = zip(
    *client["birth_number"].apply(parse_details)
)
client["age"] = client["birth_day"].apply(calculate_age)

data_frames["client.csv"] = client

# Auswahl spezifischer Spalten für die finale DataFrame (optional, je nach Bedarf)
# Sample 5 random rows
client.sample(n=5)
client_id birth_number district_id gender birth_day age
Loading... (need help?)

Disp

data_frames["disp.csv"] = disp

# random sample
disp.sample(n=5)
disp_id client_id account_id type
Loading... (need help?)

District

  • A1 district_id/district code
  • A2 district name
  • A3 region
  • A4 no. of inhabitants
  • A5 no. of municipalities with inhabitants < 499
  • A6 no. of municipalities with inhabitants 500-1999 A7 no. of municipalities with inhabitants 2000-9999
  • A8 no. of municipalities with inhabitants >10000
  • A9 no. of cities
  • A10 ratio of urban inhabitants
  • A11 average salary
  • A12 unemploymant rate ’95
  • A13 unemploymant rate ’96
  • A14 no. of enterpreneurs per 1000 inhabitants
  • A15 no. of commited crimes ’95
  • A16 no. of commited crimes ’96
import pandas as pd

# Assuming 'district' is your pandas DataFrame

# Renaming and selecting columns
district = district.rename(
    columns={
        "A1": "district_id",
        "A2": "district_name",
        "A3": "region",
        "A4": "num_of_habitat",
        "A5": "num_of_small_town",
        "A6": "num_of_medium_town",
        "A7": "num_of_big_town",
        "A8": "num_of_bigger_town",
        "A9": "num_of_city",
        "A10": "ratio_of_urban",
        "A11": "average_salary",
        "A12": "unemploy_rate95",
        "A13": "unemploy_rate96",
        "A14": "n_of_enterpren_per1000_inhabit",
        "A15": "no_of_crimes95",
        "A16": "no_of_crimes96",
    }
)[
    [
        "district_id",
        "district_name",
        "region",
        "num_of_habitat",
        "num_of_small_town",
        "num_of_medium_town",
        "num_of_big_town",
        "num_of_bigger_town",
        "num_of_city",
        "ratio_of_urban",
        "average_salary",
        "unemploy_rate95",
        "unemploy_rate96",
        "n_of_enterpren_per1000_inhabit",
        "no_of_crimes95",
        "no_of_crimes96",
    ]
]

data_frames["district.csv"] = district

district.sample(n=5)
district_id district_name region num_of_habitat num_of_small_town num_of_medium_town num_of_big_town num_of_bigger_town num_of_city ratio_of_urban average_salary unemploy_rate95 unemploy_rate96 n_of_enterpren_per1000_inhabit no_of_crimes95 no_of_crimes96
Loading... (need help?)
# Convert the 'date' column to datetime format
loan["date"] = pd.to_datetime(loan["date"], format="mixed")

# Mutate the 'status' column based on conditions
loan["status"] = loan["status"].map(
    {
        "A": "contract finished",
        "B": "finished contract, loan not paid",
        "C": "running contract",
        "D": "client in debt",
    }
)

# Group by 'account_id', calculate the number of loans, and sort the results
num_of_loan_df = (
    loan.groupby("account_id")
    .size()
    .reset_index(name="num_of_loan")
    .sort_values(by="num_of_loan", ascending=False)
)

# Display the resulting DataFrame
num_of_loan_df
account_id num_of_loan
Loading... (need help?)
# Perform an inner join between 'loan' and 'num_of_loan_df' on 'account_id'
loan = pd.merge(loan, num_of_loan_df, on="account_id", how="inner")

# Assign the resulting DataFrame to a dictionary for storage
data_frames["loan.csv"] = loan

# Sample 5 random rows from the joined DataFrame
loan.sample(n=100)
loan_id account_id date amount duration payments status num_of_loan
Loading... (need help?)